import csv
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# CNKI (China National Knowledge Infrastructure) home page.
url = "https://www.cnki.net/"
# Search keyword typed into the home-page search box.
wd = '四轴无人机'
# Output CSV, opened in append mode so earlier results are kept between runs.
# newline='' lets the csv module control line endings itself.
file = open('pdf_href.csv', 'a', encoding='utf-8', newline='')
# CSV writer used by the rest of the script to record (title, link) rows.
csv_file = csv.writer(file)
# Append mode positions the stream at the end of the file, so tell() == 0
# means the file is new or empty. Only then write the header row; otherwise
# every re-run would insert a duplicate header in the middle of the data.
if file.tell() == 0:
    csv_file.writerow(['标题','链接'])

# Start a Chrome session and load the CNKI home page.
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)  # fixed pause to let the home page finish loading
# Elements are located with find_element(By...) because the
# find_element_by_* helpers were removed in Selenium 4.3; the By-based form
# works on both Selenium 3 and 4.
txt_SearchText = driver.find_element(By.ID, 'txt_SearchText')
# Type the query into the search box.
txt_SearchText.send_keys(wd)
time.sleep(1)
# Click the search button to open the result list.
button = driver.find_element(By.CSS_SELECTOR, 'body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn')
button.click()
time.sleep(5)  # fixed pause to let the result page load

# Read the cookies held by the browser session and flatten them into a
# single "name=value;name=value;" string, later sent as the request
# "cookie" header when fetching detail pages.
res = driver.get_cookies()
cookies = ''.join(
    '{}={};'.format(entry['name'], entry['value'])
    for entry in res
)

# Number of PDF links written to the CSV so far.
num = 0
# Headers for the detail-page requests. Built once: nothing in them changes
# from one request to the next.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer":"https://kns.cnki.net/kns8/defaultresult/index",
    "Host": "kns.cnki.net",
    "cookie":cookies
}
try:
    # Walk the result pages one by one until there is no "next page" control.
    while True:

        html = driver.page_source
        time.sleep(1)
        soup = BeautifulSoup(html, 'html.parser')
        # Pager texts (total pages / current page). Either span can be absent,
        # so fall back to a placeholder instead of raising AttributeError.
        total_span = soup.find('span',attrs={"class":"total"})
        cur_span = soup.find('span',attrs={"class":"cur"})
        pn = total_span.text if total_span is not None else '?'
        cur = cur_span.text if cur_span is not None else '?'
        print('当前页数：{}， 当前第{}页'.format(pn, cur))

        result_table = soup.find('table', attrs={"class":"result-table-list"})
        if result_table is None:
            # No result table in the page source: nothing left to scrape.
            print('未找到结果列表，停止翻页')
            break
        # NOTE(review): only rows carrying class "odd" are visited — confirm
        # whether rows without that class should be scraped as well.
        tr_list = result_table.find_all('tr', attrs={"class":"odd"})
        print('*'*100)
        for td in tr_list:
            name_cell = td.find('td', attrs={"class":"name"})
            res_a = name_cell.find('a') if name_cell is not None else None
            if res_a is None or not res_a.get('href'):
                # Row without a title link: there is no detail page to visit.
                continue
            file_name = res_a.text.strip()
            href = 'https://kns.cnki.net' + res_a.get('href')
            print('访问子页面：',href)
            pdfDown = None
            try:
                # The timeout keeps one stalled request from hanging the run.
                response = requests.get(href, headers=headers, timeout=30).text
            except requests.RequestException as exc:
                # A failed detail page is skipped; the remaining rows go on.
                print('请求子页面失败，跳过：{}'.format(exc))
            else:
                soup_pdf = BeautifulSoup(response, 'html.parser')
                pdfDown = soup_pdf.find('a', attrs={"id":"pdfDown"})
            # Explicit None checks: the anchor may be missing from the page,
            # and it may lack an href attribute.
            pdf_link = pdfDown.get('href') if pdfDown is not None else None
            if pdf_link:
                pdfDown_href = 'https://kns.cnki.net' + pdf_link.strip()
                print('文件名称：{}， 完整下载链接：{}'.format(file_name, pdfDown_href))
                csv_file.writerow([file_name, pdfDown_href])
                # Increment before reporting so the first file is number 1.
                num += 1
                print('当前下载第{}个文件'.format(num))
            print('*'*100)
            time.sleep(3)  # pause between detail-page requests
        # Go to the next result page, or stop when that is not possible.
        try:
            PageNext = driver.find_element(By.CSS_SELECTOR, '#PageNext')
            print('================================获取下一页，点击下一页================================')
            PageNext.click()
            time.sleep(5)  # fixed pause for the next page to load
        except WebDriverException:
            # Covers a missing "next" control as well as a failed click.
            # Unlike a bare except, KeyboardInterrupt is not swallowed.
            print('未发现下一页，跳出循环，关闭浏览器')
            break
except BaseException:
    # On any failure (including Ctrl-C) release the CSV file and the browser
    # here, because the cleanup after the loop is not reached; then re-raise
    # so the error stays visible.
    file.close()
    driver.quit()
    raise
# Scraping finished: close the CSV file so buffered rows are written out.
file.close()
# quit() instead of close(): close() only closes the current window, while
# quit() ends the WebDriver session and shuts down the driver process.
driver.quit()